home *** CD-ROM | disk | FTP | other *** search
- Path: xanth!mcnc!gatech!ukma!husc6!necntc!ncoast!allbery
- From: jbuck@epimass.EPI.COM (Joe Buck)
- Newsgroups: comp.sources.misc
- Subject: v03i066: Perl program for accumulating Usenet traffic statistics
- Message-ID: <2250@epimass.EPI.COM>
- Date: 29 Jun 88 02:54:03 GMT
- Sender: allbery@ncoast.UUCP
- Reply-To: jbuck@epimass.EPI.COM (Joe Buck)
- Organization: Entropic Processing, Inc., Cupertino, CA
- Lines: 230
- Approved: allbery@ncoast.UUCP
-
- Posting-number: Volume 3, Issue 66
- Submitted-by: "Joe Buck" <jbuck@epimass.EPI.COM>
- Archive-name: newspaths
-
- #! /bin/sh
- # This is a shell archive, meaning:
- # 1. Remove everything above the #! /bin/sh line.
- # 2. Save the resulting text in a file.
- # 3. Execute the file with /bin/sh (not csh) to create the files:
- # README
- # newspaths
- # newspaths.1
- # twoway
- # This archive created: Tue Jun 28 19:52:10 1988
- export PATH; PATH=/bin:$PATH
- if test -f 'README'
- then
- echo shar: will not over-write existing file "'README'"
- else
- cat << \SHAR_EOF > 'README'
- Here's the stuff I used to generate the map in article <2249@epimass.epi.com>.
- Have fun. If only it drew the maps too -- maybe Brian Reid can attach
- it to something.
-
- - Joe
-
- SHAR_EOF
- fi # end of overwriting check
- if test -f 'newspaths'
- then
- echo shar: will not over-write existing file "'newspaths'"
- else
- cat << \SHAR_EOF > 'newspaths'
- #! /usr/bin/perl
-
- # Copyright 1988 by Joseph T. Buck, jbuck@epimass.epi.com.
- # You may freely use and copy this program as long as you
- # leave my name here.
-
- # newspaths -- news path statistics gatherer
-
- # This perl program scans through all the news on your spool
- # (using the history file to find the articles) and prints
- # out a sorted list of frequencies that each pair of hosts
- # appears in the Path: headers. That is, it determines how,
- # on average, your news gets to you.
- #
- # If an argument is given, it is the name of a previous output
- # of this program. The figures are read in, and host pairs
- # from articles newer than the input file are added in.
- # So that this will work, the first line of the output of the
- # program is of the form
- # Last-ID: <5679@chinet.UUCP>
- # (without the # sign). It records the last Message-ID in the
- # history file; to add new articles, we skip in the history file
- # until we find it.
-
- # State: $skip is true while we are still scanning the history file
- # looking for $last_id; %pcount maps a hop "hostA!hostB" to the number
- # of times it was seen in a Path: header.
- # NOTE(review): Perl-4-era style throughout (bareword filehandles,
- # two-arg open, no strict) -- kept byte-for-byte for this historical
- # archive; modern Perl would reject several constructs below.
- $skip = 0;
- if ($#ARGV >= 0) {
- $ofile = $ARGV[0];
- die "Can't open $ofile!\n" unless open (of, $ofile);
- # First line must contain last msgid to use.
- $_ = <of>;
- ($key, $last_id) = split (' ');
- die "Invalid input file format!\n" if ($key ne "Last-ID:");
- $skip = 1;
- # Read in the old file.
- while (<of>) {
- ($cnt, $pair) = split(' ');
- $pcount{$pair} = $cnt;
- }
- }
- # Let's go.
-
- # NOTE(review): history file and spool locations are hard-coded below;
- # adjust for sites whose news lives elsewhere.
- die "Can't open history file!\n" unless open (hist, "/usr/lib/news/history");
- die "Can't cd to news spool directory!\n" unless chdir ("/usr/spool/news");
-
- # $np counts articles whose Path: header was found; $nlocal counts
- # locally posted articles (too few Path hops to form a pair).
- $np = $nlocal = 0;
- while (<hist>) {
- #
- # $_ contains a line from the history file. Parse it.
- # Skip it if the article has been cancelled or expired
- # If the $skip flag is true, we skip until we have the right msgid
- #
- ($id, $date, $time, $file) = split (' ');
- next if ($file eq 'cancelled' || $file eq '');
- if ($skip) {
- if ($id eq $last_id) { $skip = 0; }
- next;
- }
- #
- # format of field is like comp.sources.unix/2345 . Get ng and filename.
- #
- ($ng, $n) = split (/\//, $file);
- # Turn the dots into slashes so the history entry becomes the article's
- # pathname relative to the spool, e.g.
- # "comp.sources.unix/2345" -> "comp/sources/unix/2345".
- $file =~ tr%.%/%;
- #
- # The following may be used to skip any local groups. Here, we
- # skip group names beginning with "epi" or "su". Change to suit taste.
- #
- next if $ng =~ /^epi|^su/;
- next unless open (art, $file); # skip if cannot open file
- #
- # Article OK. Get its path.
- while (<art>) {
- ($htype, $hvalue) = split (' ');
- if ($htype eq "Path:") {
- # We have the path, in hvalue.
- $np++;
- @path = split (/!/, $hvalue);
- # Handle locally posted articles.
- if ($#path < 2) { $nlocal++; last;}
- # Create and count pairs.
- # The loop stops at $#path - 1, so the pair ending in the final
- # Path component is never counted -- presumably because the last
- # component is the poster's login, not a host. TODO confirm.
- for ($i = 0; $i < $#path - 1; $i++) {
- $pair = $path[$i] . "!" . $path[$i+1];
- $pcount{$pair} += 1;
- }
- last;
- }
- }
- }
- # Make sure print message comes out before sort data.
- # ($id still holds the Message-ID from the last history line parsed.)
- $| = 1;
- print "Last-ID: $id\n";
- $| = 0;
- # write the data out, sorted. Open a pipe.
- # NOTE(review): "sort" here is a bareword filehandle piped to sort(1),
- # and each (pcount) names the hash without its % sigil -- both accepted
- # by 1988 Perl; modern Perl requires each(%pcount) and a lexical handle.
- die "Can't exec sort!\n" unless open (sort, "|sort -nr");
-
- while (($pair, $n) = each (pcount)) {
- printf sort ("%6d %s\n", $n, $pair);
- }
- close sort;
- SHAR_EOF
- fi # end of overwriting check
- if test -f 'newspaths.1'
- then
- echo shar: will not over-write existing file "'newspaths.1'"
- else
- cat << \SHAR_EOF > 'newspaths.1'
- .TH NEWSPATHS 1 5/31/88
- .SH "NAME"
- newspaths \- collect host connectivity information from Usenet articles
- .SH "SYNOPSIS"
- .B newspaths
- [
- .I prev_out
- ]
- .SH "DESCRIPTION"
- .I Newspaths
- scans the history file, in
- .IR /usr/lib/news/history ,
- using it to locate Usenet articles. For each article, the Path:
- header is read, and split up into its constituent host pairs.
- An associative array keeps track of how many times each pair of
- hosts appears in a Path: header. Every article in the history file is
- scanned.
- .PP
- The output format consists of the last Message-ID in the history file,
- followed by a sorted list of host pairs and counts. Here's a sample:
- .sp
- .nf
- Last-ID: <24524@oliveb.olivetti.com>
- 42814 epimass!pyramid
- 22610 pyramid!ames
- 12824 pyramid!decwrl
- 7854 ames!mailrus
- 5426 oliveb!ames
- 5391 ames!pasteur
- 5212 decwrl!ucbvax
- < lots more >
- .fi
- .PP
- If an input file argument is given, it should be the name of a previous
- output of
- .IR newspaths .
- The recorded host pairs and counts are read in, and the history file is scanned until
- the Last-ID is found. Normal processing begins with the next
- article given in the history file. The idea is to be able to continue
- to accumulate the counts over long periods of time (despite
- .I expire
- runs) and to count each article only once. It is an error if the
- given Message-ID is not found in the history file.
- .SH "SUGGESTED USE"
- Every night, from
- .IR cron ,
- do
- .nf
- newspaths old-output > new-output
- mv new-output old-output
- .fi
- .sp
- After 1 month of operation at
- .I epimass ,
- the resulting output is 135K bytes long.
- .SH AUTHOR
- Joseph T. Buck, jbuck@epimass.epi.com
- SHAR_EOF
- fi # end of overwriting check
- if test -f 'twoway'
- then
- echo shar: will not over-write existing file "'twoway'"
- else
- cat << \SHAR_EOF > 'twoway'
- #! /usr/bin/perl
- #
- # twoway -- convert unidirectional counts from "newspaths" into two-way counts
- # Usage: twoway file
- # where file is an output from "newspaths".
-
- $_ = <>; # Skip the Last-ID: line
- # Each remaining line is "<count> <hostA!hostB>". Canonicalize the pair
- # into alphabetical order so A!B and B!A accumulate into one bucket.
- # NOTE(review): $a and $b are Perl's sort-comparator specials; reusing
- # them is harmless here only because no Perl-level sort block is used.
- while (<>) {
- ($n, $pair) = split (' ');
- ($a, $b) = split (/!/, $pair);
- if ($a gt $b) { $pair = $b . "!" . $a; } # alphabetical order
- $count{$pair} += $n;
- }
- # NOTE(review): bareword pipe filehandle "sort" and sigil-less
- # each (count) are 1988 Perl; modern Perl needs each(%count).
- die "Can't exec sort!\n" unless open (sort, "|sort -nr");
- while (($pair, $n) = each (count)) {
- printf sort ("%5d %s\n", $n, $pair);
- }
- close sort;
- SHAR_EOF
- chmod +x 'twoway'
- fi # end of overwriting check
- # End of shell archive
- exit 0
- --
- - Joe Buck {uunet,ucbvax,pyramid,<smart-site>}!epimass.epi.com!jbuck
- jbuck@epimass.epi.com Old Arpa mailers: jbuck%epimass.epi.com@uunet.uu.net
- If you leave your fate in the hands of the gods, don't be
- surprised if they have a few grins at your expense. - Tom Robbins
-